/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.indexer;

import net.nutch.io.*;
import net.nutch.util.LogFormatter;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;

import java.io.*;
import java.util.Vector;
import java.util.logging.Logger;
import java.security.MessageDigest;

/** Deletes duplicate documents in a set of Lucene indexes.
 * Duplicates have either the same contents (via MD5 hash) or the same URL.
 */
public class DeleteDuplicates {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.indexer.DeleteDuplicates");

  /** The key used in sorting for duplicates. */
  public static class IndexedDoc implements WritableComparable {
    private MD5Hash hash = new MD5Hash();
    private float score;
    private int index;                              // the segment index
    private int doc;                                // within the index
    private int urlLen;

    public void write(DataOutput out) throws IOException {
      hash.write(out);
      out.writeFloat(score);
      out.writeInt(index);
      out.writeInt(doc);
      out.writeInt(urlLen);
    }

    public void readFields(DataInput in) throws IOException {
      hash.readFields(in);
      this.score = in.readFloat();
      this.index = in.readInt();
      this.doc = in.readInt();
      this.urlLen = in.readInt();
    }
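
    // Serialized record layout produced by write(), which the raw-byte
    // comparators below rely on:
    //   bytes  0..15  MD5 hash
    //   bytes 16..19  score  (float)
    //   bytes 20..23  index  (int)
    //   bytes 24..27  doc    (int)
    //   bytes 28..31  urlLen (int)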
    public int compareTo(Object o) {
      throw new RuntimeException("this is never used");
    }

    /** Order equal hashes by decreasing score and increasing urlLen. */
    public static class ByHashScore extends WritableComparator {
      public ByHashScore() { super(IndexedDoc.class); }

      public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN);
        if (c != 0)
          return c;

        float thisScore = readFloat(b1, s1+MD5Hash.MD5_LEN);
        float thatScore = readFloat(b2, s2+MD5Hash.MD5_LEN);
        if (thisScore < thatScore)
          return 1;
        else if (thisScore > thatScore)
          return -1;

        int thisUrlLen = readInt(b1, s1+MD5Hash.MD5_LEN+12);
        int thatUrlLen = readInt(b2, s2+MD5Hash.MD5_LEN+12);
        return thisUrlLen - thatUrlLen;
      }
    }

    /** Order equal hashes by decreasing index and document. */
    public static class ByHashDoc extends WritableComparator {
      public ByHashDoc() { super(IndexedDoc.class); }

      public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN);
        if (c != 0)
          return c;

        int thisIndex = readInt(b1, s1+MD5Hash.MD5_LEN+4);
        int thatIndex = readInt(b2, s2+MD5Hash.MD5_LEN+4);
        if (thisIndex != thatIndex)
          return thatIndex - thisIndex;

        int thisDoc = readInt(b1, s1+MD5Hash.MD5_LEN+8);
        int thatDoc = readInt(b2, s2+MD5Hash.MD5_LEN+8);
        return thatDoc - thisDoc;
      }
    }
  }

  private interface Hasher {
    void updateHash(MD5Hash hash, Document doc);
  }

  private IndexReader[] readers;
  private String tempFile;

  /** Constructs a duplicate detector for the provided indexes. */
  public DeleteDuplicates(IndexReader[] readers, String tempFile) {
    this.readers = readers;
    this.tempFile = tempFile;
  }

  /** Closes the indexes, saving changes. */
  public void close() throws IOException {
    for (int i = 0; i < readers.length; i++)
      readers[i].close();
  }

  /** Delete pages with duplicate content hashes.  Of those with the same
   * content hash, keep the page with the highest score.
   */
  public void deleteContentDuplicates() throws IOException {
    LOG.info("Reading content hashes...");
    computeHashes(new Hasher() {
        public void updateHash(MD5Hash hash, Document doc) {
          hash.setDigest(doc.get("digest"));
        }
      });

    LOG.info("Sorting content hashes...");
    SequenceFile.Sorter byHashScoreSorter =
      new SequenceFile.Sorter(new IndexedDoc.ByHashScore(), NullWritable.class);
    byHashScoreSorter.sort(tempFile, tempFile + ".sorted");

    LOG.info("Deleting content duplicates...");
    int duplicateCount = deleteDuplicates();
    LOG.info("Deleted " + duplicateCount + " content duplicates.");
  }

  /** Delete pages with duplicate URLs.  Of those with the same
   * URL, keep the most recently fetched page.
   */
  public void deleteUrlDuplicates() throws IOException {
    final MessageDigest digest;
    try {
      digest = MessageDigest.getInstance("MD5");
    } catch (Exception e) {
      throw new RuntimeException(e.toString());
    }

    LOG.info("Reading url hashes...");
    computeHashes(new Hasher() {
        public void updateHash(MD5Hash hash, Document doc) {
          try {
            digest.update(UTF8.getBytes(doc.get("url")));
            digest.digest(hash.getDigest(), 0, MD5Hash.MD5_LEN);
          } catch (Exception e) {
            throw new RuntimeException(e.toString());
          }
        }
      });

    LOG.info("Sorting url hashes...");
    SequenceFile.Sorter byHashDocSorter =
      new SequenceFile.Sorter(new IndexedDoc.ByHashDoc(), NullWritable.class);
    byHashDocSorter.sort(tempFile, tempFile + ".sorted");

    LOG.info("Deleting url duplicates...");
    int duplicateCount = deleteDuplicates();
    LOG.info("Deleted " + duplicateCount + " url duplicates.");
  }

  private void computeHashes(Hasher hasher) throws IOException {
    IndexedDoc indexedDoc = new IndexedDoc();

    SequenceFile.Writer writer =
      new SequenceFile.Writer(tempFile, IndexedDoc.class, NullWritable.class);
    try {
      for (int index = 0; index < readers.length; index++) {
        IndexReader reader = readers[index];
        int readerMax = reader.maxDoc();
        indexedDoc.index = index;
        for (int doc = 0; doc < readerMax; doc++) {
          if (!reader.isDeleted(doc)) {
            Document document = reader.document(doc);
            hasher.updateHash(indexedDoc.hash, document);
            indexedDoc.score = Float.parseFloat(document.get("boost"));
            indexedDoc.doc = doc;
            indexedDoc.urlLen = document.get("url").length();
            writer.append(indexedDoc, NullWritable.get());
          }
        }
      }
    } finally {
      writer.close();
    }
  }
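
  // deleteDuplicates() scans the sorted temp file, keeps the first record of
  // each run of equal hashes, and deletes the rest from their indexes.  The
  // sort order therefore decides which copy survives: ByHashScore keeps the
  // highest-scoring page, ByHashDoc the most recently indexed one.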
  private int deleteDuplicates() throws IOException {
    if (new File(tempFile).exists())
      new File(tempFile).delete();
    if (!new File(tempFile + ".sorted").renameTo(new File(tempFile)))
      throw new IOException("Couldn't rename!");

    IndexedDoc indexedDoc = new IndexedDoc();
    SequenceFile.Reader reader = new SequenceFile.Reader(tempFile);
    try {
      int duplicateCount = 0;
      MD5Hash prevHash = null;                      // previous hash
      while (reader.next(indexedDoc, NullWritable.get())) {
        if (prevHash == null) {                     // initialize prevHash
          prevHash = new MD5Hash();
          prevHash.set(indexedDoc.hash);
          continue;
        }
        if (indexedDoc.hash.equals(prevHash)) {     // found a duplicate
          readers[indexedDoc.index].delete(indexedDoc.doc); // delete it
          duplicateCount++;
        } else {
          prevHash.set(indexedDoc.hash);            // reset prevHash
        }
      }
      return duplicateCount;
    } finally {
      reader.close();
      new File(tempFile).delete();
    }
  }

  /** Delete duplicates in the indexes in the named directory. */
  public static void main(String[] args) throws Exception {
    String usage = "DeleteDuplicates <segmentsDir> <tempFile>";

    if (args.length != 2) {
      System.err.println("Usage: " + usage);
      return;
    }

    String segmentsDir = args[0];
    String tempFile = args[1];

    File[] directories = new File(segmentsDir).listFiles();
    Vector vReaders = new Vector();
    //IndexReader[] readers = new IndexReader[directories.length];
    int maxDoc = 0;
    for (int i = 0; i < directories.length; i++) {
      File indexDone = new File(directories[i], IndexSegment.DONE_NAME);
      if (indexDone.exists() && indexDone.isFile()) {
        File indexDir = new File(directories[i], "index");
        IndexReader reader = IndexReader.open(indexDir);
        if (reader.hasDeletions()) {
          LOG.info("Clearing old deletions in " + indexDir);
          reader.undeleteAll();
        }
        maxDoc += reader.maxDoc();
        vReaders.add(reader);
      }
    }

    IndexReader[] readers = new IndexReader[vReaders.size()];
    for (int i = 0; vReaders.size() > 0; i++) {
      readers[i] = (IndexReader)vReaders.remove(0);
    }

    DeleteDuplicates dd = new DeleteDuplicates(readers, tempFile);
    dd.deleteUrlDuplicates();
    dd.deleteContentDuplicates();
    dd.close();
  }
}
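
/*
 * Minimal usage sketch, not part of the original Nutch source.  It shows the
 * programmatic equivalent of main() above for two segment indexes; the segment
 * paths and the "dedup.tmp" temp file are placeholder names, and each "index"
 * directory is assumed to have been written by IndexSegment.
 *
 * Command-line equivalent, per the usage string in main():
 *   java net.nutch.indexer.DeleteDuplicates segments dedup.tmp
 */
class DeleteDuplicatesUsageSketch {
  public static void main(String[] args) throws Exception {
    // Open one reader per per-segment index (hypothetical paths).
    IndexReader[] readers = new IndexReader[] {
      IndexReader.open(new File("segments/20040101000000/index")),
      IndexReader.open(new File("segments/20040102000000/index"))
    };

    // Remove URL duplicates first, then content duplicates, as main() does.
    DeleteDuplicates dedup = new DeleteDuplicates(readers, "dedup.tmp");
    dedup.deleteUrlDuplicates();
    dedup.deleteContentDuplicates();
    dedup.close();                // commits the deletions and closes the readers
  }
}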